knitr::include_graphics("group.gif")Mulltinomial Logistic Regression Assignment
- Workflow
Introduction
Preparation
Read data
Data Wrangling
Create new categorical variable from fbs
exploratory data analysis
Confirm the order of fbs
Estimation
VGAM Package
NNET Package
- Inferences
VGAM packages
NNET package
- Prediction
Predict the log odss
Predict the probability
Result
Interpretation
1 Workflow
2 Introduction
The dataset contains 4,092 rows and 21 features collected from community-based health screening data. One of the key variables of interest is fasting blood sugar (FBS), which has been categorized into three clinical classes:
Normal (FBS < 5.6 mmol/L)
Prediabetes (FBS 5.6–6.9 mmol/L)
Diabetes (FBS ≥ 7.0 mmol/L)
The dataset includes a variety of demographic, anthropometric, and biochemical features such as:
Demographic data: age, gender, rural/urban locality
Lifestyle indicators: smoking status, hypertension history
Anthropometric measures: height, weight, waist and hip circumference
Blood pressure: mean systolic and diastolic blood pressure
Biochemical values: HbA1c, cholesterol profiles (HDL, LDL, triglycerides, total cholesterol), and oral glucose tolerance test (OGTT) values
This dataset is suitable for exploring predictors of abnormal fasting blood sugar levels using multinomial logistic regression.
3 Preparation
library(here)
library(tidyverse)
library(haven)
library(gtsummary)
library(VGAM)
library(nnet)
library(broom)
library(knitr)
library(kableExtra)
library(tibble)
library(purrr)
library(gt)
library(ggplot2)
library(ggeffects)
library(reshape2)
library(data.table)
library(foreign)
library(gridExtra)
library(grid)
library(viridis)
library(ggpubr)
library(rmarkdown)
library(readxl)4 Read data
datafbs <- read_csv("datamssm_b.csv")
glimpse(datafbs)Rows: 4,092
Columns: 21
$ codesub <chr> "AHA215459", "LAW215133", "SAL215736", "MJB216145", "TIS22632…
$ age <dbl> 44, 64, 41, 34, 58, 39, 48, 41, 34, 22, 38, 47, 33, 20, 51, 4…
$ hpt <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ smoking <chr> "still smoking", "never smoked", "never smoked", "still smoki…
$ dmdx <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ height <dbl> 1.63, 1.59, 1.46, 1.52, 1.48, 1.53, 1.48, 1.52, 1.55, 1.65, 1…
$ weight <dbl> 52.65, 55.30, 49.90, 48.70, 43.35, 60.10, 70.70, 63.65, 50.00…
$ waist <dbl> 71.0, 83.5, 70.0, 75.0, 78.1, 83.0, 102.6, 90.0, 73.0, 75.0, …
$ hip <dbl> 85.0, 89.9, 88.0, 86.7, 87.8, 93.5, 102.9, 104.0, 87.0, 90.0,…
$ msbpr <dbl> 124.5, 157.5, 115.5, 111.0, 140.5, 160.0, 119.5, 150.0, 110.5…
$ mdbpr <dbl> 73.5, 68.0, 64.5, 68.5, 68.5, 99.5, 76.0, 82.0, 73.5, 79.0, 7…
$ hba1c <dbl> 4.2, 5.3, 5.0, 5.1, 5.2, 5.9, 5.5, 4.4, 5.1, 4.9, 5.3, 5.5, 5…
$ fbs <dbl> 2.50, 2.50, 2.51, 2.52, 2.52, 2.52, 2.52, 2.52, 2.55, 2.55, 2…
$ mogtt1h <dbl> 2.04, 9.99, 7.82, 5.67, 10.28, 11.44, 6.31, 9.00, 5.03, 4.30,…
$ mogtt2h <dbl> 3.30, 8.44, 6.42, 5.45, 4.56, 6.95, 5.58, 7.07, 4.55, 5.34, 7…
$ totchol <dbl> 4.18, 5.27, 5.10, 5.55, 6.72, 6.69, 7.28, 5.74, 3.48, 4.08, 4…
$ ftrigliz <dbl> 1.20, 1.28, 0.68, 0.86, 1.13, 1.59, 2.18, 1.24, 0.26, 0.96, 0…
$ hdl <dbl> 1.07, 0.99, 1.47, 1.49, 1.46, 1.28, 1.20, 1.22, 1.04, 0.81, 1…
$ ldl <dbl> 2.43, 3.64, 2.73, 3.56, 4.54, 4.08, 5.02, 3.70, 2.01, 2.34, 2…
$ gender <chr> "male", "male", "female", "male", "female", "female", "female…
$ crural <chr> "rural", "rural", "rural", "rural", "urban", "rural", "rural"…
5 Convert character to factor
datafbs <- datafbs %>%
mutate(across(where(is.character), as.factor))
glimpse(datafbs)Rows: 4,092
Columns: 21
$ codesub <fct> AHA215459, LAW215133, SAL215736, MJB216145, TIS226328, red115…
$ age <dbl> 44, 64, 41, 34, 58, 39, 48, 41, 34, 22, 38, 47, 33, 20, 51, 4…
$ hpt <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, n…
$ smoking <fct> still smoking, never smoked, never smoked, still smoking, nev…
$ dmdx <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, n…
$ height <dbl> 1.63, 1.59, 1.46, 1.52, 1.48, 1.53, 1.48, 1.52, 1.55, 1.65, 1…
$ weight <dbl> 52.65, 55.30, 49.90, 48.70, 43.35, 60.10, 70.70, 63.65, 50.00…
$ waist <dbl> 71.0, 83.5, 70.0, 75.0, 78.1, 83.0, 102.6, 90.0, 73.0, 75.0, …
$ hip <dbl> 85.0, 89.9, 88.0, 86.7, 87.8, 93.5, 102.9, 104.0, 87.0, 90.0,…
$ msbpr <dbl> 124.5, 157.5, 115.5, 111.0, 140.5, 160.0, 119.5, 150.0, 110.5…
$ mdbpr <dbl> 73.5, 68.0, 64.5, 68.5, 68.5, 99.5, 76.0, 82.0, 73.5, 79.0, 7…
$ hba1c <dbl> 4.2, 5.3, 5.0, 5.1, 5.2, 5.9, 5.5, 4.4, 5.1, 4.9, 5.3, 5.5, 5…
$ fbs <dbl> 2.50, 2.50, 2.51, 2.52, 2.52, 2.52, 2.52, 2.52, 2.55, 2.55, 2…
$ mogtt1h <dbl> 2.04, 9.99, 7.82, 5.67, 10.28, 11.44, 6.31, 9.00, 5.03, 4.30,…
$ mogtt2h <dbl> 3.30, 8.44, 6.42, 5.45, 4.56, 6.95, 5.58, 7.07, 4.55, 5.34, 7…
$ totchol <dbl> 4.18, 5.27, 5.10, 5.55, 6.72, 6.69, 7.28, 5.74, 3.48, 4.08, 4…
$ ftrigliz <dbl> 1.20, 1.28, 0.68, 0.86, 1.13, 1.59, 2.18, 1.24, 0.26, 0.96, 0…
$ hdl <dbl> 1.07, 0.99, 1.47, 1.49, 1.46, 1.28, 1.20, 1.22, 1.04, 0.81, 1…
$ ldl <dbl> 2.43, 3.64, 2.73, 3.56, 4.54, 4.08, 5.02, 3.70, 2.01, 2.34, 2…
$ gender <fct> male, male, female, male, female, female, female, female, fem…
$ crural <fct> rural, rural, rural, rural, urban, rural, rural, urban, rural…
6 Data wrangling
datafbs <- datafbs %>%
select(fbs, hba1c, ftrigliz, age, waist)
# inspect the selected data
glimpse(datafbs)Rows: 4,092
Columns: 5
$ fbs <dbl> 2.50, 2.50, 2.51, 2.52, 2.52, 2.52, 2.52, 2.52, 2.55, 2.55, 2…
$ hba1c <dbl> 4.2, 5.3, 5.0, 5.1, 5.2, 5.9, 5.5, 4.4, 5.1, 4.9, 5.3, 5.5, 5…
$ ftrigliz <dbl> 1.20, 1.28, 0.68, 0.86, 1.13, 1.59, 2.18, 1.24, 0.26, 0.96, 0…
$ age <dbl> 44, 64, 41, 34, 58, 39, 48, 41, 34, 22, 38, 47, 33, 20, 51, 4…
$ waist <dbl> 71.0, 83.5, 70.0, 75.0, 78.1, 83.0, 102.6, 90.0, 73.0, 75.0, …
6.1 Rename column
datafbs <- datafbs %>%
rename(
fbs_raw = fbs,
hba1c = hba1c,
triglycerides = ftrigliz,
age = age,
waist_circumference = waist
)7 Create new categorical variable of outcome (cat_fbs)
datafbs <- datafbs %>%
mutate(
cat_fbs = case_when(
fbs_raw < 5.6 ~ "normal",
fbs_raw >= 5.6 & fbs_raw < 7.0 ~ "prediabetes",
fbs_raw >= 7.0 ~ "diabetes"
),
cat_fbs = factor(cat_fbs, levels = c("diabetes", "prediabetes", "normal"))
)
summary(datafbs) fbs_raw hba1c triglycerides age
Min. : 2.500 Min. : 0.200 Min. : 0.110 Min. :18.00
1st Qu.: 4.480 1st Qu.: 5.100 1st Qu.: 0.930 1st Qu.:38.00
Median : 5.180 Median : 5.400 Median : 1.260 Median :48.00
Mean : 5.734 Mean : 5.829 Mean : 1.541 Mean :48.05
3rd Qu.: 6.020 3rd Qu.: 5.800 3rd Qu.: 1.780 3rd Qu.:58.00
Max. :28.010 Max. :15.000 Max. :12.660 Max. :89.00
NA's :22 NA's :4
waist_circumference cat_fbs
Min. : 50.80 diabetes : 563
1st Qu.: 77.00 prediabetes: 889
Median : 86.00 normal :2640
Mean : 86.44
3rd Qu.: 95.00
Max. :154.50
NA's :2
datafbs %>%
count(cat_fbs)# select predictors and outcome
datafbs <- datafbs %>%
select(cat_fbs, hba1c, triglycerides, age, waist_circumference)8 Exploratory Data Analysis
# relevel cat_fbs into desired order
datafbs <- datafbs %>%
mutate(cat_fbs = factor(cat_fbs, levels = c("normal", "prediabetes", "diabetes")))
# create table summary
datafbs %>%
tbl_summary(
by = cat_fbs,
statistic = list(all_continuous() ~ "{mean} ({sd})"),
digits = all_continuous() ~ 2,
missing = "no"
) %>%
add_overall() %>%
bold_labels()| Characteristic | Overall N = 4,0921 |
normal N = 2,6401 |
prediabetes N = 8891 |
diabetes N = 5631 |
|---|---|---|---|---|
| hba1c | 5.83 (1.46) | 5.38 (0.67) | 5.76 (0.96) | 8.05 (2.46) |
| triglycerides | 1.54 (1.07) | 1.37 (0.88) | 1.70 (1.10) | 2.11 (1.50) |
| age | 48.05 (14.48) | 45.50 (14.67) | 52.53 (13.35) | 52.98 (12.09) |
| waist_circumference | 86.44 (13.04) | 84.47 (12.93) | 89.21 (12.42) | 91.31 (12.49) |
| 1 Mean (SD) | ||||
8.1 Plots
# Count cat_fbs categories and calculate percentages
fbs_counts <- datafbs %>% count(cat_fbs)
fbs_counts$percent <- round(fbs_counts$n / sum(fbs_counts$n) * 100, 2)
# Define custom colors for the categories
custom_colors <- c('#A6CEE3', '#FDBF6F', '#B2DF8A') # Light blue, orange, green
# Pie chart for cat_fbs
p1 <- ggplot(fbs_counts, aes(x = "", y = n, fill = cat_fbs)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y") +
geom_text(aes(label = paste0(percent, "%")), position = position_stack(vjust = 0.5)) +
labs(title = "FBS Category Distribution", fill = "FBS Category") +
scale_fill_manual(values = custom_colors) +
theme_void()
# Bar chart for cat_fbs
p2 <- ggplot(fbs_counts, aes(x = cat_fbs, y = n, fill = cat_fbs)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = -0.5) +
labs(title = "FBS Category Count", x = "FBS Category", y = "Count") +
scale_fill_manual(values = custom_colors) +
theme_minimal()
# Combine both plots into one row
gridExtra::grid.arrange(p1, p2, ncol = 2, top = grid::textGrob(
"Distribution of FBS Categories",
gp = grid::gpar(fontsize = 20, fontface = "bold")
))Use histogram since all variables are numerical
8.1.1 Hba1c
datafbs |>
ggplot(aes(hba1c)) +
geom_histogram() +
facet_grid(. ~ cat_fbs)8.1.2 Age
datafbs |>
ggplot(aes(age)) +
geom_histogram() +
facet_grid(. ~ cat_fbs)8.1.3 Triglycerides
datafbs |>
ggplot(aes(triglycerides)) +
geom_histogram() +
facet_grid(. ~ cat_fbs)8.1.4 Waist circumference
datafbs |>
ggplot(aes(waist_circumference)) +
geom_histogram() +
facet_grid(. ~ cat_fbs)9 Confirm the order of cat_fbs
levels(datafbs$cat_fbs)[1] "normal" "prediabetes" "diabetes"
However, we would like the diabetes as the smallest category. To do that we will use the fct_relevel()function.
datafbs <- datafbs %>%
mutate(cat_fbs = fct_relevel(cat_fbs,
c("diabetes", 'prediabetes', 'normal')))
levels(datafbs$cat_fbs)[1] "diabetes" "prediabetes" "normal"
10 Estimation
10.1 VGAM Package
Our intention to investigate the relationship between hba1c, age, triglyceride level, and waist circumference with the outcome variables cat_fbs. Thus, we will perform multinomial logistic regression model to estimate the relation for 2 models:
Model 1: Diabetes vs Normal
Model 2: Prediabetes vs Normal
In both models, the reference group is Normal
10.2 Single Independent Variable
10.2.1 Hba1c
log_hba1c <- vglm(cat_fbs ~ hba1c,
multinomial, data = datafbs)
summary(log_hba1c)
Call:
vglm(formula = cat_fbs ~ hba1c, family = multinomial, data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -10.42508 0.37083 -28.11 <2e-16 ***
(Intercept):2 -5.16554 0.31937 -16.17 <2e-16 ***
hba1c:1 1.47018 0.06190 23.75 <2e-16 ***
hba1c:2 0.73734 0.05706 12.92 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 5920.927 on 8136 degrees of freedom
Log-likelihood: -2960.463 on 8136 degrees of freedom
Number of Fisher scoring iterations: 6
Warning: Hauck-Donner effect detected in the following estimate(s):
'hba1c:1'
Reference group is level 3 of the response
10.2.2 Triglycerides
log_triglycerides <- vglm(cat_fbs ~ triglycerides,
multinomial, data = datafbs)
summary(log_triglycerides)
Call:
vglm(formula = cat_fbs ~ triglycerides, family = multinomial,
data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -2.47972 0.08617 -28.776 <2e-16 ***
(Intercept):2 -1.65108 0.07428 -22.227 <2e-16 ***
triglycerides:1 0.57088 0.04204 13.578 <2e-16 ***
triglycerides:2 0.37324 0.04094 9.116 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 7029.323 on 8172 degrees of freedom
Log-likelihood: -3514.661 on 8172 degrees of freedom
Number of Fisher scoring iterations: 4
No Hauck-Donner effect found in any of the estimates
Reference group is level 3 of the response
10.2.3 Age
log_age <- vglm(cat_fbs ~ age,
multinomial, data = datafbs)
summary(log_age)
Call:
vglm(formula = cat_fbs ~ age, family = multinomial, data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -3.402279 0.182518 -18.64 <2e-16 ***
(Intercept):2 -2.821518 0.150325 -18.77 <2e-16 ***
age:1 0.037696 0.003416 11.04 <2e-16 ***
age:2 0.035347 0.002860 12.36 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 7023.23 on 8180 degrees of freedom
Log-likelihood: -3511.615 on 8180 degrees of freedom
Number of Fisher scoring iterations: 4
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):1'
Reference group is level 3 of the response
10.2.4 Waist Circumference
log_waist <- vglm(cat_fbs ~ waist_circumference,
multinomial, data = datafbs)
summary(log_waist)
Call:
vglm(formula = cat_fbs ~ waist_circumference, family = multinomial,
data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -5.099767 0.326151 -15.636 <2e-16 ***
(Intercept):2 -3.604592 0.271383 -13.282 <2e-16 ***
waist_circumference:1 0.040485 0.003596 11.257 <2e-16 ***
waist_circumference:2 0.028998 0.003057 9.485 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 7077.815 on 8176 degrees of freedom
Log-likelihood: -3538.908 on 8176 degrees of freedom
Number of Fisher scoring iterations: 4
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):1'
Reference group is level 3 of the response
10.3 Multiple Independent Variables
We feel that hba1c, triglycerides level, age, and waist circumference are all important independent variables. Hence, we want to fit a model with the four independent variables as the covariates.
mlog <- vglm(cat_fbs ~ hba1c + triglycerides + age + waist_circumference,
family = multinomial, data = datafbs)
summary(mlog)
Call:
vglm(formula = cat_fbs ~ hba1c + triglycerides + age + waist_circumference,
family = multinomial, data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -11.985830 0.551930 -21.716 < 2e-16 ***
(Intercept):2 -6.876354 0.390274 -17.619 < 2e-16 ***
hba1c:1 1.244479 0.060070 20.717 < 2e-16 ***
hba1c:2 0.482128 0.057903 8.327 < 2e-16 ***
triglycerides:1 0.326086 0.050564 6.449 1.13e-10 ***
triglycerides:2 0.212952 0.040810 5.218 1.81e-07 ***
age:1 0.022920 0.004621 4.960 7.06e-07 ***
age:2 0.026560 0.002993 8.874 < 2e-16 ***
waist_circumference:1 0.013298 0.004818 2.760 0.00578 **
waist_circumference:2 0.017140 0.003273 5.236 1.64e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 5717.238 on 8118 degrees of freedom
Log-likelihood: -2858.619 on 8118 degrees of freedom
Number of Fisher scoring iterations: 6
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):1', '(Intercept):2', 'hba1c:1'
Reference group is level 3 of the response
11 Model with interaction term
Then, we hypothesize that there could be a significant interaction between age and waist circumference. And to test the hypothesis, we extend the multivariable logistic regression model by adding an interaction term.
mlog_interaction <- vglm(cat_fbs ~ hba1c + triglycerides + age + waist_circumference +
age*waist_circumference,
family = multinomial,
data = datafbs)
summary(mlog_interaction)
Call:
vglm(formula = cat_fbs ~ hba1c + triglycerides + age + waist_circumference +
age * waist_circumference, family = multinomial, data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -1.441e+01 1.572e+00 -9.163 < 2e-16 ***
(Intercept):2 -7.492e+00 1.000e+00 -7.492 6.78e-14 ***
hba1c:1 1.250e+00 6.024e-02 20.745 < 2e-16 ***
hba1c:2 4.843e-01 5.797e-02 8.355 < 2e-16 ***
triglycerides:1 3.242e-01 5.073e-02 6.391 1.65e-10 ***
triglycerides:2 2.127e-01 4.081e-02 5.213 1.85e-07 ***
age:1 7.090e-02 2.924e-02 2.424 0.0153 *
age:2 3.945e-02 1.890e-02 2.088 0.0368 *
waist_circumference:1 4.056e-02 1.705e-02 2.378 0.0174 *
waist_circumference:2 2.432e-02 1.108e-02 2.196 0.0281 *
age:waist_circumference:1 -5.463e-04 3.272e-04 -1.669 0.0950 .
age:waist_circumference:2 -1.527e-04 2.171e-04 -0.703 0.4818
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 5714.41 on 8116 degrees of freedom
Log-likelihood: -2857.205 on 8116 degrees of freedom
Number of Fisher scoring iterations: 6
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):1', '(Intercept):2', 'hba1c:1'
Reference group is level 3 of the response
The interaction term in our model showed p-values above 0.05. As the p-value is bigger than the level of significance at 5% and the value of regression parameters for the interaction terms are likely not clinically meaningful, we have decided not to use the model with an interaction term.
12 NNET Package
Unlike VGAM::vglm function - where the reference or the base outcome is the largest group (level) - the nnet::multinom uses the smallest group (level) as the reference or base outcome.
12.1 relevel the outcome variables to make normal as reference group
datafbs <- datafbs %>%
mutate(cat_fbs_relev = relevel(cat_fbs, ref = "normal"))
levels(datafbs$cat_fbs_relev)[1] "normal" "diabetes" "prediabetes"
12.2 Fit multinomial logistic regression using nnet::multinom()
mlog_nnet <- multinom(cat_fbs_relev ~ hba1c + triglycerides + age + waist_circumference, data = datafbs)# weights: 18 (10 variable)
initial value 4464.760341
iter 10 value 3167.437397
iter 20 value 2858.619099
final value 2858.619059
converged
summary(mlog_nnet)Call:
multinom(formula = cat_fbs_relev ~ hba1c + triglycerides + age +
waist_circumference, data = datafbs)
Coefficients:
(Intercept) hba1c triglycerides age waist_circumference
diabetes -11.985584 1.2444619 0.3260688 0.02292334 0.01329567
prediabetes -6.876298 0.4821214 0.2129488 0.02655990 0.01713959
Std. Errors:
(Intercept) hba1c triglycerides age
diabetes 0.5519231 0.06006911 0.05056369 0.004621221
prediabetes 0.3902720 0.05790213 0.04080945 0.002992863
waist_circumference
diabetes 0.004818100
prediabetes 0.003273272
Residual Deviance: 5717.238
AIC: 5737.238
12.3 Comparing objects from VGAM::vglm and nnet::multinom
so running multinom will give result as above. Now lets compare the result with vglm
summary(mlog)
Call:
vglm(formula = cat_fbs ~ hba1c + triglycerides + age + waist_circumference,
family = multinomial, data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -11.985830 0.551930 -21.716 < 2e-16 ***
(Intercept):2 -6.876354 0.390274 -17.619 < 2e-16 ***
hba1c:1 1.244479 0.060070 20.717 < 2e-16 ***
hba1c:2 0.482128 0.057903 8.327 < 2e-16 ***
triglycerides:1 0.326086 0.050564 6.449 1.13e-10 ***
triglycerides:2 0.212952 0.040810 5.218 1.81e-07 ***
age:1 0.022920 0.004621 4.960 7.06e-07 ***
age:2 0.026560 0.002993 8.874 < 2e-16 ***
waist_circumference:1 0.013298 0.004818 2.760 0.00578 **
waist_circumference:2 0.017140 0.003273 5.236 1.64e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 5717.238 on 8118 degrees of freedom
Log-likelihood: -2858.619 on 8118 degrees of freedom
Number of Fisher scoring iterations: 6
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):1', '(Intercept):2', 'hba1c:1'
Reference group is level 3 of the response
13 Inference
13.1 VGAM Package
For the inference, we will: 1. calculate the 95% CI (interval estimates) 2. calculate the p-values (hypothesis testing)
There is no facility inside thebroom::tidy() function to generate confidence intervals for object with class vglm. Because of that we will use the coef(), confint() and cind() functions to produce a rather nice table of inferences.
We are going to follow these steps: 1. set the number of digits equal to 2 to limit the decimal numbers 2. return the regression coefficents for all ^β as an object named b_fitmlog2 3. return the the confidence intervals for all ^β as an object named ci_fitmlog2 4. combine the ^β and the corresponding 95% CIs
b_mlog <- coef(mlog)
ci_mlog <- confint(mlog)
b_ci_mlog <- data.frame(b_mlog,ci_mlog) %>%
rename("log odds" = b_mlog, "Lower CI" = X2.5.., "Upper CI" = X97.5..)
b_ci_mlog %>%
kbl(digits = 2, booktabs = T, caption = "Log odds from multinomial logistic regression") %>%
kable_styling(position = "center")| log odds | Lower CI | Upper CI | |
|---|---|---|---|
| (Intercept):1 | -11.99 | -13.07 | -10.90 |
| (Intercept):2 | -6.88 | -7.64 | -6.11 |
| hba1c:1 | 1.24 | 1.13 | 1.36 |
| hba1c:2 | 0.48 | 0.37 | 0.60 |
| triglycerides:1 | 0.33 | 0.23 | 0.43 |
| triglycerides:2 | 0.21 | 0.13 | 0.29 |
| age:1 | 0.02 | 0.01 | 0.03 |
| age:2 | 0.03 | 0.02 | 0.03 |
| waist_circumference:1 | 0.01 | 0.00 | 0.02 |
| waist_circumference:2 | 0.02 | 0.01 | 0.02 |
Afterwards, we will exponentiate the coefficients to obtain the relative-risk ratio. We then combine the results to the previous table. Finally, we will name the columns of the object tab_fitmlog2.
rrr_mlog <- exp(b_ci_mlog)
tab_mlog <- cbind(b_ci_mlog, rrr_mlog)
colnames(tab_mlog) <- c('b', 'lower b', 'upper b',
'RRR', 'lower RRR', 'upper RRR')
tab_mlog %>%
kbl(digits = 2, booktabs = T, caption = "Log odds and RRR from multinomial logistic regression") %>%
kable_styling(position = "center")| b | lower b | upper b | RRR | lower RRR | upper RRR | |
|---|---|---|---|---|---|---|
| (Intercept):1 | -11.99 | -13.07 | -10.90 | 0.00 | 0.00 | 0.00 |
| (Intercept):2 | -6.88 | -7.64 | -6.11 | 0.00 | 0.00 | 0.00 |
| hba1c:1 | 1.24 | 1.13 | 1.36 | 3.47 | 3.09 | 3.90 |
| hba1c:2 | 0.48 | 0.37 | 0.60 | 1.62 | 1.45 | 1.81 |
| triglycerides:1 | 0.33 | 0.23 | 0.43 | 1.39 | 1.25 | 1.53 |
| triglycerides:2 | 0.21 | 0.13 | 0.29 | 1.24 | 1.14 | 1.34 |
| age:1 | 0.02 | 0.01 | 0.03 | 1.02 | 1.01 | 1.03 |
| age:2 | 0.03 | 0.02 | 0.03 | 1.03 | 1.02 | 1.03 |
| waist_circumference:1 | 0.01 | 0.00 | 0.02 | 1.01 | 1.00 | 1.02 |
| waist_circumference:2 | 0.02 | 0.01 | 0.02 | 1.02 | 1.01 | 1.02 |
# Build final table using provided values
data <- data.frame(
Group = c("Diabetes", "", "", "Prediabetes", "", ""),
Variable = c("Intercept", "hba1c", "triglycerides", "Intercept", "hba1c", "triglycerides"),
B = c(-11.99, 1.24, 0.33, -6.88, 0.48, 0.21),
SE = c(0.55, 0.06, 0.05, 0.39, 0.06, 0.04),
Wald = c(-21.72, 20.72, 6.45, -17.62, 8.33, 5.22),
p = c("<0.001", "<0.001", "<0.001", "<0.001", "<0.001", "<0.001"),
OR = c(0.00, 3.47, 1.39, 0.00, 1.62, 1.24),
`95%CI` = c(
"(0.00, 0.00)", "(3.09, 3.90)", "(1.25, 1.53)",
"(0.00, 0.00)", "(1.45, 1.81)", "(1.14, 1.34)"
)
)
# Create styled table
kbl(data, booktabs = TRUE,
col.names = c("Group", "Variable", "B", "SE", "Wald", "p", "OR", "95% CI"),
caption = "Table 2: Log odds and RRR from multinomial logistic regression") %>%
kable_styling(full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE, border_right = TRUE) %>%
row_spec(0, bold = TRUE, color = "white", background = "#D7261E") %>%
footnote(general = "a The reference group is normal")| Group | Variable | B | SE | Wald | p | OR | 95% CI |
|---|---|---|---|---|---|---|---|
| Diabetes | Intercept | -11.99 | 0.55 | -21.72 | <0.001 | 0.00 | (0.00, 0.00) |
| hba1c | 1.24 | 0.06 | 20.72 | <0.001 | 3.47 | (3.09, 3.90) | |
| triglycerides | 0.33 | 0.05 | 6.45 | <0.001 | 1.39 | (1.25, 1.53) | |
| Prediabetes | Intercept | -6.88 | 0.39 | -17.62 | <0.001 | 0.00 | (0.00, 0.00) |
| hba1c | 0.48 | 0.06 | 8.33 | <0.001 | 1.62 | (1.45, 1.81) | |
| triglycerides | 0.21 | 0.04 | 5.22 | <0.001 | 1.24 | (1.14, 1.34) | |
| Note: | |||||||
| a The reference group is normal |
13.2 NNET Package
13.2.1 P-value
z.test <- summary(mlog_nnet)$coefficients/summary(mlog_nnet)$standard.errors
# 2-tailed
p.val <- (1 - pnorm(abs(z.test), 0, 1)) * 2
colnames(p.val) <- c('p-val intercept', 'p-val hba1c', 'p-val triglycerides', 'p-val age', 'p-val waist_circumference')
p.val p-val intercept p-val hba1c p-val triglycerides p-val age
diabetes 0 0 1.128333e-10 7.032938e-07
prediabetes 0 0 1.807447e-07 0.000000e+00
p-val waist_circumference
diabetes 5.788540e-03
prediabetes 1.638923e-07
13.2.2 CI for nnet::multinom
confint(mlog_nnet, level=0.95), , diabetes
2.5 % 97.5 %
(Intercept) -13.067333280 -10.90383468
hba1c 1.126728603 1.36219519
triglycerides 0.226965738 0.42517177
age 0.013865917 0.03198077
waist_circumference 0.003852367 0.02273897
, , prediabetes
2.5 % 97.5 %
(Intercept) -7.6412167 -6.11137860
hba1c 0.3686353 0.59560754
triglycerides 0.1329637 0.29293383
age 0.0206940 0.03242581
waist_circumference 0.0107241 0.02355509
14 Prediction
14.1 Predict the log odds
summary(log_hba1c)
Call:
vglm(formula = cat_fbs ~ hba1c, family = multinomial, data = datafbs)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -10.42508 0.37083 -28.11 <2e-16 ***
(Intercept):2 -5.16554 0.31937 -16.17 <2e-16 ***
hba1c:1 1.47018 0.06190 23.75 <2e-16 ***
hba1c:2 0.73734 0.05706 12.92 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 5920.927 on 8136 degrees of freedom
Log-likelihood: -2960.463 on 8136 degrees of freedom
Number of Fisher scoring iterations: 6
Warning: Hauck-Donner effect detected in the following estimate(s):
'hba1c:1'
Reference group is level 3 of the response
The predicted log odds for the first 6 observations:
the predicted log odds for diabetes vs normal in column 1
the predicted log odds for prediabetes vs normal in column 2
head(predict.vgam(log_hba1c, type = 'link')) log(mu[,1]/mu[,3]) log(mu[,2]/mu[,3])
[1,] -4.250306 -2.0687082
[2,] -2.633104 -1.2576329
[3,] -3.074159 -1.4788352
[4,] -2.927141 -1.4051011
[5,] -2.780122 -1.3313670
[6,] -1.750994 -0.8152282
You can verify these prediction manually. For example the calculations for:
the 1st observation log odds
the 3rd observation log odds
head(datafbs)[1:3,]The values for the
the 1st observation is hba1c = 4.2
the 3rd observation is hba1c = 5.0
# ptn 1: hba1c = 4.2
# logit cat_fbs=diabetes = [1] vs cat_fbs=normal =[3]
#-10.42508 + 1.47018 *4.2
-10.42508 + 1.47018 *4.2[1] -4.250324
# logit cat_fbs=prediabetes = [2] vs cat_fbs=normal =[3]
#-5.16554 + 0.73734*4.2
-5.16554 + 0.73734*4.2[1] -2.068712
# ptn 3: hba1c = 5.0
# logit cat_fbs=diabetes = [1] vs cat_fbs=normal =[3]
#-10.42508 + 1.47018 *5.0
-10.42508 + 1.47018 *5.0[1] -3.07418
# logit cat_fbs=prediabetes = [2] vs cat_fbs=normal =[3]
#-5.16554 + 0.73734*5.0
-5.16554 + 0.73734*5.0[1] -1.47884
15 Predict the probability
The predicted probability for the first 6 observation
head(predict.vgam(log_hba1c, type = 'response')) diabetes prediabetes normal
1 0.01250198 0.1107732 0.8767248
2 0.05298339 0.2096521 0.7373645
3 0.03628235 0.1788693 0.7848484
4 0.04122739 0.1888858 0.7698868
5 0.04677530 0.1991604 0.7540643
6 0.10741729 0.2738243 0.6187584
Manual calculation for probability. Let us take the first observation where,
log odds for group diabetes: -4.250306
log odds for group prediabetes: -2.0687082
# probability being in the reference group (cat_fbs == normal = [3])
# 1/(1 + exp(-4.250306) + exp(-2.0687082)
1/(1 + exp( -4.250306 ) + exp(-2.0687082))[1] 0.8767248
# probability being in the prediabetes group (cat_fbs == prediabetes = [2])
# exp(-2.0687082)/(1 + exp(-4.250306) + exp(-2.0687082)
exp(-2.0687082)/(1 + exp( -4.250306 ) + exp(-2.0687082))[1] 0.1107732
# probability being in the diabetes group (cat_fbs == diabetes = [1])
# exp(-4.250306)/(1 + exp(-4.250306) + exp(-2.0687082)
exp(-4.250306)/(1 + exp( -4.250306 ) + exp(-2.0687082))[1] 0.01250198
16 Results
# Assuming datafbs is your data frame
# Creating the summary table with the caption
datafbs %>%
select(cat_fbs, hba1c, triglycerides, age, waist_circumference) %>%
tbl_summary(
by = cat_fbs,
statistic = all_continuous() ~ "{mean} ({sd})",
digits = all_continuous() ~ 2,
missing = "no",
label = list(
hba1c ~ "HbA1c",
triglycerides ~ "Triglycerides",
age ~ "Age",
waist_circumference ~ "Waist Circumference"
)
) %>%
add_overall() %>%
modify_caption("**Table 1: Characteristics of FBS Categories**") %>%
bold_labels()| Characteristic | Overall N = 4,0921 |
diabetes N = 5631 |
prediabetes N = 8891 |
normal N = 2,6401 |
|---|---|---|---|---|
| HbA1c | 5.83 (1.46) | 8.05 (2.46) | 5.76 (0.96) | 5.38 (0.67) |
| Triglycerides | 1.54 (1.07) | 2.11 (1.50) | 1.70 (1.10) | 1.37 (0.88) |
| Age | 48.05 (14.48) | 52.98 (12.09) | 52.53 (13.35) | 45.50 (14.67) |
| Waist Circumference | 86.44 (13.04) | 91.31 (12.49) | 89.21 (12.42) | 84.47 (12.93) |
| 1 Mean (SD) | ||||
library(kableExtra)
# Updated data with 4 predictors
data <- data.frame(
Group = c("Diabetes", "", "", "", "",
"Prediabetes", "", "", "", ""),
Variable = c("Intercept", "hba1c", "triglycerides", "age", "waist_circumference",
"Intercept", "hba1c", "triglycerides", "age", "waist_circumference"),
B = c(-11.99, 1.24, 0.33, 0.02, 0.01,
-6.88, 0.48, 0.21, 0.03, 0.02),
SE = c(0.55, 0.06, 0.05, 0.005, 0.005,
0.39, 0.06, 0.04, 0.003, 0.003),
Wald = c(-21.72, 20.72, 6.45, 4.96, 2.76,
-17.62, 8.33, 5.22, 8.87, 5.24),
p = c("<0.001", "<0.001", "<0.001", "<0.001", "0.006",
"<0.001", "<0.001", "<0.001", "<0.001", "<0.001"),
OR = c(0.00, 3.47, 1.39, 1.02, 1.01,
0.00, 1.62, 1.24, 1.03, 1.02),
`95%CI` = c(
"(0.00, 0.00)", "(3.09, 3.90)", "(1.25, 1.53)", "(1.01, 1.03)", "(1.00, 1.02)",
"(0.00, 0.00)", "(1.45, 1.81)", "(1.14, 1.34)", "(1.02, 1.03)", "(1.01, 1.02)"
)
)
# Create styled table
kbl(data, booktabs = TRUE,
col.names = c("Group", "Variable", "B", "SE", "Wald", "p", "OR", "95% CI"),
caption = "Table 2: Log odds and RRR from multinomial logistic regression") %>%
kable_styling(full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE, border_right = TRUE) %>%
row_spec(0, bold = TRUE, color = "white", background = "#D7261E") %>%
footnote(general = "a The reference group is normal")| Group | Variable | B | SE | Wald | p | OR | 95% CI |
|---|---|---|---|---|---|---|---|
| Diabetes | Intercept | -11.99 | 0.550 | -21.72 | <0.001 | 0.00 | (0.00, 0.00) |
| hba1c | 1.24 | 0.060 | 20.72 | <0.001 | 3.47 | (3.09, 3.90) | |
| triglycerides | 0.33 | 0.050 | 6.45 | <0.001 | 1.39 | (1.25, 1.53) | |
| age | 0.02 | 0.005 | 4.96 | <0.001 | 1.02 | (1.01, 1.03) | |
| waist_circumference | 0.01 | 0.005 | 2.76 | 0.006 | 1.01 | (1.00, 1.02) | |
| Prediabetes | Intercept | -6.88 | 0.390 | -17.62 | <0.001 | 0.00 | (0.00, 0.00) |
| hba1c | 0.48 | 0.060 | 8.33 | <0.001 | 1.62 | (1.45, 1.81) | |
| triglycerides | 0.21 | 0.040 | 5.22 | <0.001 | 1.24 | (1.14, 1.34) | |
| age | 0.03 | 0.003 | 8.87 | <0.001 | 1.03 | (1.02, 1.03) | |
| waist_circumference | 0.02 | 0.003 | 5.24 | <0.001 | 1.02 | (1.01, 1.02) | |
| Note: | |||||||
| a The reference group is normal |
17 Interpretation
17.1 HbA1c
Diabetes vs Normal:
Every increment of 1 unit in HbA1c, controlling for other factors, increases the odds of being in the diabetes group (in comparison to the normal group) by 1.24 (Adjusted RRR = 3.47, 95% CI: 3.09, 3.90, p-value <0.001). The 95% confidence interval (3.09, 3.90) is narrow and does not include 1, indicating a statistically significant increase in the odds of being in the diabetes group as HbA1c increases.Prediabetes vs Normal:
Every increment of 1 unit in HbA1c, controlling for other factors, increases the odds of being in the prediabetes group (in comparison to the normal group) by 0.48 (Adjusted RRR = 1.62, 95% CI: 1.45, 1.81, p-value <0.001). The 95% confidence interval (1.45, 1.81) is narrow and does not include 1, indicating a statistically significant increase in the odds of being in the prediabetes group as HbA1c increases.
17.2 Triglycerides
Diabetes vs Normal:
Every increment of 1 unit in triglyceride level, controlling for other factors, increases the odds of being in the diabetes group (in comparison to the normal group) by 0.33 (Adjusted RRR = 1.39, 95% CI: 1.25, 1.53, p-value <0.001). The 95% confidence interval (1.25, 1.53) is narrow and does not include 1, indicating a statistically significant increase in the odds of being in the diabetes group as triglyceride levels increase.Prediabetes vs Normal:
Every increment of 1 unit in triglyceride level, controlling for other factors, increases the odds of being in the prediabetes group (in comparison to the normal group) by 0.21 (Adjusted RRR = 1.24, 95% CI: 1.14, 1.34, p-value <0.001). The 95% confidence interval (1.14, 1.34) is narrow and does not include 1, indicating a statistically significant increase in the odds of being in the prediabetes group as triglyceride levels increase.
17.3 Age
Diabetes vs Normal:
Every increment of 1 year in age, controlling for other factors, increases the odds of being in the diabetes group (in comparison to the normal group) by 0.02 (Adjusted RRR = 1.02, 95% CI: 1.01, 1.03, p-value <0.001). The 95% confidence interval (1.01, 1.03) is narrow and does not include 1, indicating a statistically significant association between older age and higher odds of diabetes.Prediabetes vs Normal:
Every increment of 1 year in age, controlling for other factors, increases the odds of being in the prediabetes group (in comparison to the normal group) by 0.03 (Adjusted RRR = 1.03, 95% CI: 1.02, 1.03, p-value <0.001). The 95% confidence interval (1.02, 1.03) is narrow and does not include 1, indicating a statistically significant increase in the odds of prediabetes with age.
17.4 Waist Circumference
Diabetes vs Normal:
Every increment of 1 cm in waist circumference, controlling for other factors, increases the odds of being in the diabetes group (in comparison to the normal group) by 0.01 (Adjusted RRR = 1.01, 95% CI: 1.00, 1.02, p-value = 0.006). The 95% confidence interval (1.00, 1.02) is narrow and does not include 1, indicating a statistically significant association between larger waist circumference and increased odds of diabetes.Prediabetes vs Normal:
Every increment of 1 cm in waist circumference, controlling for other factors, increases the odds of being in the prediabetes group (in comparison to the normal group) by 0.02 (Adjusted RRR = 1.02, 95% CI: 1.01, 1.02, p-value <0.001). The 95% confidence interval (1.01, 1.02) is narrow and does not include 1, indicating a statistically significant association between larger waist circumference and increased odds of prediabetes.